import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
# Load the cleaned rental listings; deep-copy so later edits never mutate the raw frame.
raw_rental_df = pd.read_parquet('data/clean_housing_data.parquet')
rental_df = raw_rental_df.copy(deep= True)
# Only training/testing model on initial date
# NOTE(review): iloc[0] takes the date of the first ROW, which equals the earliest
# date only if the frame is sorted by date — confirm, or use rental_df['date'].min().
initial_date = rental_df['date'].iloc[0]
initial_rental_df = rental_df[rental_df['date'] == initial_date]
# initial_rental_df = initial_rental_df.set_index('id')
initial_rental_df.columns
Index(['id', 'unit_code', 'property_code', 'market_id', 'market_name',
'address_1', 'address_2', 'city', 'state', 'zipcode', 'country', 'beds',
'baths', 'sqft', 'market_rent', 'lat', 'lng', 'available_at', 'unit_id',
'unit_status', 'is_syndicated', 'is_syndicated_ils', 'is_on_special',
'new_construction', 'est_rehab_complete_date', 'rehab_type',
'submarket_names', 'subdivision', 'bid_type', 'asset_review_type',
'days_on_market', 'formattedAddress', 'date', 'has_virtual_tour',
'btr_community', 'model_home'],
dtype='object')
# Compact modeling view: location/market categoricals plus unit size, market time, and rent.
wide_columns = ['state', 'city', 'market_name', 'submarket_names',
                'beds', 'baths', 'sqft', 'days_on_market', 'market_rent']
filtered_rental_df = initial_rental_df.loc[:, wide_columns]
filtered_rental_df.head()
| state | city | market_name | submarket_names | beds | baths | sqft | days_on_market | market_rent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | CA | Los Angeles | Southern California | LA Metro West | 3 | 1.0 | 1236 | 0 | 3299.0 |
| 1 | CA | Los Angeles | Southern California | LA Metro West | 3 | 1.0 | 1422 | 9 | 3575.0 |
| 2 | CA | Los Angeles | Southern California | LA Metro West | 2 | 3.0 | 1499 | 0 | 3999.0 |
| 3 | CA | Burbank | Southern California | San Fernando Valley East | 3 | 2.0 | 1500 | 0 | 4399.0 |
| 4 | CA | North Hollywood | Southern California | San Fernando Valley East | 3 | 2.0 | 1553 | 0 | 3899.0 |
# Preprocessing adapted from: https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline_column_transformer.html
# Separate response from predictors, then one-hot encode the object columns and
# z-score the numeric ones inside a single ColumnTransformer.
y_response = filtered_rental_df['market_rent']
X_regressors = filtered_rental_df.drop(columns='market_rent')
select_categorical = make_column_selector(dtype_include=object)
select_numerical = make_column_selector(dtype_include=np.number)
categorical_cols = select_categorical(X_regressors)
numerical_cols = select_numerical(X_regressors)
preprocessor = ColumnTransformer([
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('standard_scaler', StandardScaler(), numerical_cols),
])
# Baseline pipeline: preprocessing + a small forest (10 trees keeps this first
# fit fast; forest size is tuned later with GridSearchCV).
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=10, random_state=42))
])
# Hold out 20% for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_regressors, y_response, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# f-strings are the idiomatic replacement for str.format; output is identical.
print(f"Mean squared error: {mse:.2f}")
print(f"R-squared score: {r2:.2f}")
Mean squared error: 66066.78
R-squared score: 0.73
# Rank transformed features by the forest's impurity-based importances.
model_feature_names = model.named_steps['preprocessor'].get_feature_names_out()
model_feature_importance = model.named_steps['model'].feature_importances_
importance_df = pd.DataFrame({
    'Feature': model_feature_names,
    'Importance': model_feature_importance,
})
importance_df.sort_values('Importance', ascending=False).round(4).head(15)
| Feature | Importance | |
|---|---|---|
| 610 | standard_scaler__sqft | 0.2984 |
| 392 | one-hot-encoder__market_name_South Florida/Miami | 0.1812 |
| 1 | one-hot-encoder__state_CA | 0.1324 |
| 3 | one-hot-encoder__state_FL | 0.0428 |
| 391 | one-hot-encoder__market_name_Seattle | 0.0294 |
| 10 | one-hot-encoder__state_WA | 0.0154 |
| 609 | standard_scaler__baths | 0.0137 |
| 2 | one-hot-encoder__state_CO | 0.0133 |
| 608 | standard_scaler__beds | 0.0111 |
| 383 | one-hot-encoder__market_name_Denver | 0.0107 |
| 384 | one-hot-encoder__market_name_Houston | 0.0094 |
| 611 | standard_scaler__days_on_market | 0.0092 |
| 499 | one-hot-encoder__submarket_names_Manatee | 0.0088 |
| 545 | one-hot-encoder__submarket_names_Port St. Lucie | 0.0068 |
| 438 | one-hot-encoder__submarket_names_Denver South | 0.0064 |
# Leaner feature set: market hierarchy plus unit size only.
reduced_columns = ['market_name', 'submarket_names', 'beds', 'baths', 'sqft', 'market_rent']
filtered_rental_df = initial_rental_df.loc[:, reduced_columns]
filtered_rental_df.head()
| market_name | submarket_names | beds | baths | sqft | market_rent | |
|---|---|---|---|---|---|---|
| 0 | Southern California | LA Metro West | 3 | 1.0 | 1236 | 3299.0 |
| 1 | Southern California | LA Metro West | 3 | 1.0 | 1422 | 3575.0 |
| 2 | Southern California | LA Metro West | 2 | 3.0 | 1499 | 3999.0 |
| 3 | Southern California | San Fernando Valley East | 3 | 2.0 | 1500 | 4399.0 |
| 4 | Southern California | San Fernando Valley East | 3 | 2.0 | 1553 | 3899.0 |
# Rebuild the preprocessing + model pipeline on the reduced feature set.
X_regressors = filtered_rental_df.drop(columns='market_rent')
y_response = filtered_rental_df['market_rent']
categorical_cols = make_column_selector(dtype_include=object)(X_regressors)
numerical_cols = make_column_selector(dtype_include=np.number)(X_regressors)
preprocessor = ColumnTransformer([
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('standard_scaler', StandardScaler(), numerical_cols)
])
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=10, random_state=42))
])
# Same 80/20 split and seed as the first run so the two feature sets are comparable.
X_train, X_test, y_train, y_test = train_test_split(X_regressors, y_response, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# f-strings over str.format (identical output).
print(f"Mean squared error: {mse:.2f}")
print(f"R-squared score: {r2:.2f}")
Mean squared error: 67877.12
R-squared score: 0.72
# 5-fold CV on the reduced feature set. Seed the shuffled KFold so the folds —
# and therefore the reported scores — are reproducible, matching the
# random_state=42 convention used everywhere else in this file.
cv_results = cross_validate(model, X_regressors, y_response, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring=('r2', 'neg_mean_squared_error'))
scores = cv_results["test_r2"]
print("The mean cross-validation r2 is: "
      f"{scores.mean():.3f} ± {scores.std():.3f}")
The mean cross-validation r2 is: 0.752 ± 0.020
# Higher n_estimators had diminishing returns, increases run time
# Sweep forest size alone; seed the shuffled KFold so CV scores are
# reproducible (consistent with the random_state=42 used elsewhere).
grid_search_n_estimators = GridSearchCV(model, {'model__n_estimators': [3, 5, 10, 50, 100, 150]}, cv=KFold(n_splits=5, shuffle=True, random_state=42))
grid_search_n_estimators.fit(X_regressors, y_response)
print("Best hyperparameters:", grid_search_n_estimators.best_params_)
print("Best cross-validation score:", grid_search_n_estimators.best_score_)
# Keep only the columns needed for the diminishing-returns plot below.
cv_n_estimators_results_df = pd.DataFrame({key: grid_search_n_estimators.cv_results_[key] for key in [
    'param_model__n_estimators',
    'mean_fit_time',
    'std_fit_time',
    'mean_test_score',
    'std_test_score']
})
# Dual-axis plot: CV test score (left axis) vs fit time (right axis) as the
# number of trees grows — visualizes the diminishing-returns trade-off.
fig = make_subplots(specs=[[{"secondary_y": True}]])
n_estimators_axis = cv_n_estimators_results_df['param_model__n_estimators']
for metric, on_secondary in (('mean_test_score', False), ('mean_fit_time', True)):
    fig.add_trace(
        go.Scatter(
            name=metric,
            x=n_estimators_axis,
            y=cv_n_estimators_results_df[metric],
        ),
        secondary_y=on_secondary,
    )
fig.update_layout(
    title="<b>Diminishing Returns of n_estimators</b>",
    xaxis_title="n_estimators",
)
fig.update_yaxes(title="Mean Test Score", secondary_y=False)
fig.update_yaxes(title="Mean Fit Time", showgrid=False, secondary_y=True)
fig.show()
# KFold used instead of StratifiedKFold due to a few low n factors
# Joint sweep over forest size and tree-growth regularizers; seed the shuffled
# KFold so the search is reproducible (random_state=42, as elsewhere in file).
grid_search = GridSearchCV(model, {
    'model__n_estimators': [5, 10, 50],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    },
    cv=KFold(n_splits=5, shuffle=True, random_state=42)
)
grid_search.fit(X_regressors, y_response)
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
Best hyperparameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 10, 'model__n_estimators': 50}
Best cross-validation score: 0.7750455210043266
# grid_search.cv_results_.keys()
# Tabulate the grid-search columns of interest and show the top-scoring settings.
summary_keys = [
    'param_model__max_depth',
    'param_model__min_samples_leaf',
    'param_model__min_samples_split',
    'param_model__n_estimators',
    'mean_fit_time',
    'std_fit_time',
    'mean_test_score',
    'std_test_score',
]
cv_results_df = pd.DataFrame({key: grid_search.cv_results_[key] for key in summary_keys})
cv_results_df.sort_values('mean_test_score', ascending=False).head(10)
| param_model__max_depth | param_model__min_samples_leaf | param_model__min_samples_split | param_model__n_estimators | mean_fit_time | std_fit_time | mean_test_score | std_test_score | |
|---|---|---|---|---|---|---|---|---|
| 8 | None | 1 | 10 | 50 | 1.063368 | 0.103277 | 0.775046 | 0.014673 |
| 5 | None | 1 | 5 | 50 | 1.104402 | 0.013629 | 0.774974 | 0.017345 |
| 2 | None | 1 | 2 | 50 | 1.499969 | 0.018676 | 0.770252 | 0.015971 |
| 17 | None | 2 | 10 | 50 | 0.774774 | 0.010860 | 0.766626 | 0.017218 |
| 14 | None | 2 | 5 | 50 | 0.951253 | 0.036973 | 0.760375 | 0.019956 |
| 7 | None | 1 | 10 | 10 | 0.245263 | 0.050321 | 0.759739 | 0.018483 |
| 11 | None | 2 | 2 | 50 | 1.023605 | 0.039185 | 0.757929 | 0.020903 |
| 4 | None | 1 | 5 | 10 | 0.230825 | 0.004269 | 0.756572 | 0.023369 |
| 1 | None | 1 | 2 | 10 | 0.357890 | 0.048881 | 0.754765 | 0.021080 |
| 16 | None | 2 | 10 | 10 | 0.160986 | 0.001781 | 0.752619 | 0.017151 |
# Final pipeline with tuned hyperparameters.
# NOTE(review): the grid search above selected min_samples_split=10 but 5 is used
# here — confirm this was intentional (their CV scores were nearly identical).
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(min_samples_split= 5, n_estimators= 50, random_state=42))
])
# Seed the shuffled KFold so the reported mean/std are reproducible.
cv_results = cross_validate(model, X_regressors, y_response, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring=('r2', 'neg_mean_squared_error'))
scores = cv_results["test_r2"]
print("The mean cross-validation r2 is: "
      f"{scores.mean():.3f} ± {scores.std():.3f}")
The mean cross-validation r2 is: 0.771 ± 0.012
# Scatter of predicted vs actual rents, with an actual=actual reference line.
# NOTE(review): y_test/y_pred come from the earlier 80/20 hold-out fit of the
# 10-tree baseline, not from the tuned pipeline above — confirm that is the
# intended comparison.
fig = go.Figure()
fig.add_trace(
    go.Scatter(name='predictions', mode='markers', x=y_test, y=y_pred)
)
fig.add_trace(
    go.Scatter(name='actual', mode='lines', x=y_test, y=y_test)
)
fig.update_layout(
    title="<b>Predicted vs Actual Market Rent</b>",
    xaxis_title="Actual Market Rent",
    yaxis_title="Predicted Market Rent",
)
fig.show()